{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "file = r'D:\\工作相关内容\\公司项目\\禁限运危化品货源\\data\\20211206\\all.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "outputs": [
    {
     "data": {
      "text/plain": "           id                                               data label\n0      129657  S:到哪里的？\\nD:到哪里啊？到芜湖的芜湖官司的。\\nS:啊，丸子的是吧？嗯，装那个$$木...     是\n1      129658  D:啊，你那地方叫什么？\\nS:嗯。\\nS:15林华$$石河子$$15林华。\\nD:石河。\\...    不是\n2      129659  S:喂，哎你好。\\nD:你那个广元到那个啥？\\nD:啊，八道$$龙门$$交嘛是吧？\\nS:广...    不是\n3      129660  D:啊，对对对对，那你咋知道呢？\\nS:啊。\\nS:振华$$空气炸锅$$还是什么之类的。\\n...   不确定\n4      129661  D:知道吧？\\nS:啊，上车也是。\\nS:$$大润发$$。\\nS:啊，你好。\\nD:463啊。\\n    不是\n...       ...                                                ...   ...\n11016  140673  D:还有小的是吗？\\nS:小白理财真空的几吨货。\\nD:这捆也是$$空油罐$$是吧？就两米两...     是\n11017  140674  D:喂，你好。\\nS:在哪里？\\nD:那个高苏州常熟到济南济阳，这个$$日化用品香皂$$给多...     是\n11018  140675  D:喂，你好。\\nS:啊，你好，你说。\\nD:啊，问下到海口的那个$$空酒瓶$$给多少钱？\\...     是\n11019  140676  S:大黑。\\nD:哎，运单123组行吧，你要17个方吧？是。\\nS:西工，我的购物车一下子是...     是\n11020  140677  D:喂，你好。\\nS:哎，你好。\\nD:你这边有个$$灯杆灯具$$到慈溪的多少钱？\\nS:1...     是\n\n[11021 rows x 3 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>data</th>\n      <th>label</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>129657</td>\n      <td>S:到哪里的？\\nD:到哪里啊？到芜湖的芜湖官司的。\\nS:啊，丸子的是吧？嗯，装那个$$木...</td>\n      <td>是</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>129658</td>\n      <td>D:啊，你那地方叫什么？\\nS:嗯。\\nS:15林华$$石河子$$15林华。\\nD:石河。\\...</td>\n      <td>不是</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>129659</td>\n      <td>S:喂，哎你好。\\nD:你那个广元到那个啥？\\nD:啊，八道$$龙门$$交嘛是吧？\\nS:广...</td>\n      <td>不是</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>129660</td>\n      <td>D:啊，对对对对，那你咋知道呢？\\nS:啊。\\nS:振华$$空气炸锅$$还是什么之类的。\\n...</td>\n      <td>不确定</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>129661</td>\n      <td>D:知道吧？\\nS:啊，上车也是。\\nS:$$大润发$$。\\nS:啊，你好。\\nD:463啊。\\n</td>\n      <td>不是</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>11016</th>\n      <td>140673</td>\n      <td>D:还有小的是吗？\\nS:小白理财真空的几吨货。\\nD:这捆也是$$空油罐$$是吧？就两米两...</td>\n      <td>是</td>\n    </tr>\n    <tr>\n      <th>11017</th>\n      <td>140674</td>\n      <td>D:喂，你好。\\nS:在哪里？\\nD:那个高苏州常熟到济南济阳，这个$$日化用品香皂$$给多...</td>\n      <td>是</td>\n    </tr>\n    <tr>\n      <th>11018</th>\n      <td>140675</td>\n      <td>D:喂，你好。\\nS:啊，你好，你说。\\nD:啊，问下到海口的那个$$空酒瓶$$给多少钱？\\...</td>\n      <td>是</td>\n    </tr>\n    <tr>\n      <th>11019</th>\n      <td>140676</td>\n      <td>S:大黑。\\nD:哎，运单123组行吧，你要17个方吧？是。\\nS:西工，我的购物车一下子是...</td>\n      <td>是</td>\n    </tr>\n    <tr>\n      <th>11020</th>\n      <td>140677</td>\n      
<td>D:喂，你好。\\nS:哎，你好。\\nD:你这边有个$$灯杆灯具$$到慈溪的多少钱？\\nS:1...</td>\n      <td>是</td>\n    </tr>\n  </tbody>\n</table>\n<p>11021 rows × 3 columns</p>\n</div>"
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(file)\n",
    "df"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "outputs": [
    {
     "data": {
      "text/plain": "array(['是', '不是', '不确定', '是#是', '不是#不是#不是#不是#不是#不是#不是#不是',\n       '是#是#是#是#是#是#是#是', '不确定#不确定#不确定#不确定#不确定#不确定#不确定#不确定',\n       '是#是#是#是#是#是#是', '有疑问', nan], dtype=object)"
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['label'].unique()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "outputs": [],
   "source": [
    "def update_label(item):\n",
    "    \"\"\"Collapse one record's ``label`` to a single annotation value.\n",
    "\n",
    "    Args:\n",
    "        item: mapping-like record (e.g. a DataFrame row) with a ``label`` entry.\n",
    "\n",
    "    Returns:\n",
    "        The label itself when it is a ``str`` that contains no ``#``;\n",
    "        otherwise the sentinel '有疑问'.\n",
    "    \"\"\"\n",
    "    raw = item[\"label\"]\n",
    "    # Non-str labels (such as a missing value) and labels made of several\n",
    "    # '#'-joined entries are both mapped to the same sentinel.\n",
    "    if type(raw) is not str or \"#\" in raw:\n",
    "        return \"有疑问\"\n",
    "    return raw\n",
    "\n",
    "\n",
    "df['label'] = df.apply(update_label, axis=1)\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "outputs": [
    {
     "data": {
      "text/plain": "array(['是', '不是', '不确定', '有疑问'], dtype=object)"
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['label'].unique()\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "\n",
    "def make_contraband_word(item):\n",
    "    \"\"\"Return the keyword wrapped in ``$$...$$`` inside ``item['data']``.\n",
    "\n",
    "    Args:\n",
    "        item: mapping-like record (e.g. a DataFrame row) with a ``data`` entry.\n",
    "\n",
    "    Returns:\n",
    "        The text between the markers when ``data`` is a string containing\n",
    "        exactly two ``$$`` markers (i.e. it splits into three pieces);\n",
    "        otherwise the empty string. Non-string ``data`` (such as a missing\n",
    "        value) also yields the empty string instead of raising.\n",
    "    \"\"\"\n",
    "    text = item['data']\n",
    "    # A non-text value has no marker to extract; return the same empty result\n",
    "    # as an unmarked transcript so the row is dropped by the later filter\n",
    "    # rather than aborting the whole apply() with AttributeError.\n",
    "    if not isinstance(text, str):\n",
    "        return \"\"\n",
    "    parts = text.split(\"$$\")\n",
    "    # Three pieces <=> exactly one $$word$$ span; anything else is ambiguous.\n",
    "    return parts[1] if len(parts) == 3 else \"\"\n",
    "\n",
    "\n",
    "df['contraband_word'] = df.apply(make_contraband_word, axis=1)\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "outputs": [
    {
     "data": {
      "text/plain": "         240\n美女        74\n核酸        64\n老虎        62\n垃圾        56\n        ... \n药渣         1\n西瓜子        1\n空调器        1\n袋大米        1\n超市购物车      1\nName: contraband_word, Length: 4893, dtype: int64"
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['contraband_word'].value_counts()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "outputs": [
    {
     "data": {
      "text/plain": "           id                                               data label  \\\n0      129657  S:到哪里的？\\nD:到哪里啊？到芜湖的芜湖官司的。\\nS:啊，丸子的是吧？嗯，装那个$$木...     是   \n1      129658  D:啊，你那地方叫什么？\\nS:嗯。\\nS:15林华$$石河子$$15林华。\\nD:石河。\\...    不是   \n2      129659  S:喂，哎你好。\\nD:你那个广元到那个啥？\\nD:啊，八道$$龙门$$交嘛是吧？\\nS:广...    不是   \n3      129660  D:啊，对对对对，那你咋知道呢？\\nS:啊。\\nS:振华$$空气炸锅$$还是什么之类的。\\n...   不确定   \n4      129661  D:知道吧？\\nS:啊，上车也是。\\nS:$$大润发$$。\\nS:啊，你好。\\nD:463啊。\\n    不是   \n...       ...                                                ...   ...   \n11016  140673  D:还有小的是吗？\\nS:小白理财真空的几吨货。\\nD:这捆也是$$空油罐$$是吧？就两米两...     是   \n11017  140674  D:喂，你好。\\nS:在哪里？\\nD:那个高苏州常熟到济南济阳，这个$$日化用品香皂$$给多...     是   \n11018  140675  D:喂，你好。\\nS:啊，你好，你说。\\nD:啊，问下到海口的那个$$空酒瓶$$给多少钱？\\...     是   \n11019  140676  S:大黑。\\nD:哎，运单123组行吧，你要17个方吧？是。\\nS:西工，我的购物车一下子是...     是   \n11020  140677  D:喂，你好。\\nS:哎，你好。\\nD:你这边有个$$灯杆灯具$$到慈溪的多少钱？\\nS:1...     是   \n\n      contraband_word  \n0                  木方  \n1                 石河子  \n2                  龙门  \n3                空气炸锅  \n4                 大润发  \n...               ...  \n11016             空油罐  \n11017          日化用品香皂  \n11018             空酒瓶  \n11019           超市购物车  \n11020            灯杆灯具  \n\n[10558 rows x 4 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>data</th>\n      <th>label</th>\n      <th>contraband_word</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>129657</td>\n      <td>S:到哪里的？\\nD:到哪里啊？到芜湖的芜湖官司的。\\nS:啊，丸子的是吧？嗯，装那个$$木...</td>\n      <td>是</td>\n      <td>木方</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>129658</td>\n      <td>D:啊，你那地方叫什么？\\nS:嗯。\\nS:15林华$$石河子$$15林华。\\nD:石河。\\...</td>\n      <td>不是</td>\n      <td>石河子</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>129659</td>\n      <td>S:喂，哎你好。\\nD:你那个广元到那个啥？\\nD:啊，八道$$龙门$$交嘛是吧？\\nS:广...</td>\n      <td>不是</td>\n      <td>龙门</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>129660</td>\n      <td>D:啊，对对对对，那你咋知道呢？\\nS:啊。\\nS:振华$$空气炸锅$$还是什么之类的。\\n...</td>\n      <td>不确定</td>\n      <td>空气炸锅</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>129661</td>\n      <td>D:知道吧？\\nS:啊，上车也是。\\nS:$$大润发$$。\\nS:啊，你好。\\nD:463啊。\\n</td>\n      <td>不是</td>\n      <td>大润发</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>11016</th>\n      <td>140673</td>\n      <td>D:还有小的是吗？\\nS:小白理财真空的几吨货。\\nD:这捆也是$$空油罐$$是吧？就两米两...</td>\n      <td>是</td>\n      <td>空油罐</td>\n    </tr>\n    <tr>\n      <th>11017</th>\n      <td>140674</td>\n      <td>D:喂，你好。\\nS:在哪里？\\nD:那个高苏州常熟到济南济阳，这个$$日化用品香皂$$给多...</td>\n      <td>是</td>\n      <td>日化用品香皂</td>\n    </tr>\n    <tr>\n      <th>11018</th>\n      <td>140675</td>\n      <td>D:喂，你好。\\nS:啊，你好，你说。\\nD:啊，问下到海口的那个$$空酒瓶$$给多少钱？\\...</td>\n      <td>是</td>\n      <td>空酒瓶</td>\n    </tr>\n  
  <tr>\n      <th>11019</th>\n      <td>140676</td>\n      <td>S:大黑。\\nD:哎，运单123组行吧，你要17个方吧？是。\\nS:西工，我的购物车一下子是...</td>\n      <td>是</td>\n      <td>超市购物车</td>\n    </tr>\n    <tr>\n      <th>11020</th>\n      <td>140677</td>\n      <td>D:喂，你好。\\nS:哎，你好。\\nD:你这边有个$$灯杆灯具$$到慈溪的多少钱？\\nS:1...</td>\n      <td>是</td>\n      <td>灯杆灯具</td>\n    </tr>\n  </tbody>\n</table>\n<p>10558 rows × 4 columns</p>\n</div>"
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df[df['contraband_word'] != \"\"]\n",
    "df = df[df['label'] != \"有疑问\"]\n",
    "df\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "outputs": [],
   "source": [
    "df.to_csv(\"save_to_mysql.csv\")"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}